# Missing-value check: scan each column and report whether any of them
# contains an NA, for the training set and then the submission set.
any(vapply(train, anyNA, logical(1)))
## [1] FALSE
any(vapply(submission, anyNA, logical(1)))
## [1] FALSE
Since both checks return FALSE, neither data set contains missing values.
# Column-wise overview (name, type, first values) of the training set.
dplyr::glimpse(train)
## Observations: 36,168
## Variables: 17
## $ age <int> 50, 47, 56, 36, 41, 32, 26, 60, 39, 55, 32, 30, 35, ...
## $ job <fct> entrepreneur, technician, housemaid, blue-collar, ma...
## $ marital <fct> married, married, married, married, married, single,...
## $ education <fct> primary, secondary, primary, primary, primary, terti...
## $ default <fct> yes, no, no, no, no, no, no, no, no, no, no, no, no,...
## $ balance <int> 537, -938, 605, 4608, 362, 0, 782, 193, 2140, 873, 0...
## $ housing <fct> yes, yes, no, yes, yes, no, no, yes, yes, yes, no, y...
## $ loan <fct> no, no, no, no, no, no, no, no, no, yes, no, no, no,...
## $ contact <fct> unknown, unknown, cellular, cellular, cellular, cell...
## $ day <int> 20, 28, 19, 14, 12, 4, 29, 12, 16, 3, 19, 27, 21, 8,...
## $ month <fct> jun, may, aug, may, may, feb, jan, may, apr, jun, au...
## $ duration <int> 11, 176, 207, 284, 217, 233, 297, 89, 539, 131, 103,...
## $ campaign <int> 15, 2, 6, 7, 3, 3, 1, 2, 1, 1, 4, 3, 1, 2, 1, 8, 1, ...
## $ pdays <int> -1, -1, -1, -1, -1, 276, -1, -1, -1, -1, -1, -1, -1,...
## $ previous <int> 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ poutcome <fct> unknown, unknown, unknown, unknown, unknown, failure...
## $ y <fct> no, no, no, no, no, yes, no, no, no, no, no, no, no,...
# Column-wise overview (name, type, first values) of the submission set.
dplyr::glimpse(submission)
## Observations: 9,043
## Variables: 16
## $ age <int> 58, 43, 51, 56, 32, 54, 58, 54, 32, 38, 57, 51, 35, ...
## $ job <fct> management, technician, retired, management, blue-co...
## $ marital <fct> married, single, married, married, single, married, ...
## $ education <fct> tertiary, secondary, primary, tertiary, primary, sec...
## $ default <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ balance <int> 2143, 593, 229, 779, 23, 529, -364, 1291, 0, 424, 24...
## $ housing <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, ye...
## $ loan <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes...
## $ contact <fct> unknown, unknown, unknown, unknown, unknown, unknown...
## $ day <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ month <fct> may, may, may, may, may, may, may, may, may, may, ma...
## $ duration <int> 261, 55, 353, 164, 160, 1492, 355, 266, 179, 104, 16...
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ poutcome <fct> unknown, unknown, unknown, unknown, unknown, unknown...
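Comparing the training set with the submission set, we can tell that the target variable is y: it is the only column that is present in the training set but missing from the submission set.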
# Distinct values taken by the target column.
unique(train[["y"]])
## [1] no yes
## Levels: no yes
As shown, the target takes only the values yes and no, so this is a binary classification problem.
# Tabulate how often each target class occurs in the training set and show
# the counts as an interactive bar chart.
target_df <- data.frame(table(train$y))
colnames(target_df) <- c("target", "count")
ggplotly(
  ggplot(data = target_df, aes(x = target, y = count, fill = target)) +
    # Counts are already computed above, so bar heights are taken as-is.
    geom_bar(position = "dodge", stat = "identity", alpha = 0.5) +
    # Legend title fixed: it previously read "targe t".
    scale_fill_manual(
      "target",
      values = c("yes" = "dodgerblue", "no" = "firebrick1")
    ) +
    theme_classic()
)
# Share of training rows whose target is "yes". Rows with a missing target
# are not counted in the numerator, only in the denominator.
sum(train$y == "yes", na.rm = TRUE) / nrow(train)
## [1] 0.1159865
Only about 11.6% of the training observations belong to the yes class; no over-/under-sampling is needed.
As the histograms show, most of the numerical features are heavily skewed and need to be normalized.
# Keep only the numeric columns of the training set, compute their pairwise
# correlation matrix and draw it: circles in the upper triangle, the numeric
# coefficients in the lower triangle.
train_num <- Filter(is.numeric, train)
res <- cor(train_num)
corrplot.mixed(
  res,
  lower = "number",
  upper = "circle",
  number.cex = .8,
  tl.cex = .8,
  tl.col = "black"
)
As the correlation plot shows, most pairs of numerical features are not strongly correlated. The one exception is pdays and previous, with a correlation of 0.54.
# Mean age for every (job, target) combination, reshaped so that each job is
# a row and each target level is a column, then drawn as a heatmap.
data <- aggregate(
  train$age,
  by = list(job = train$job, target = train$y),
  FUN = mean
)
# aggregate() stores the summarised values in a column named "x"; naming it
# explicitly stops dcast() from guessing the value column.
data <- dcast(data, job ~ target, value.var = "x")
rnames <- data[, 1]
mat_data <- data.matrix(data[, 2:ncol(data)])
rownames(mat_data) <- rnames
# 299 colours need 300 break points (one more break than colours).
my_palette <- colorRampPalette(c("red", "yellow", "green"))(n = 299)
col_breaks <- c(
  seq(-1, 0, length.out = 100),    # for red
  seq(0, 0.8, length.out = 100),   # for yellow
  seq(0.81, 1, length.out = 100)   # for green
)
data
# NOTE(review): heatmap() standardises each row by default, which is why the
# breaks span -1..1 rather than raw ages. With only two target columns every
# row reduces to about +/-0.71, so the colours only show which class is older.
# Confirm this is the intended reading.
heatmap(mat_data,
  margins = c(12, 12), # widens margins around plot
  col = my_palette,    # use the colour palette defined earlier
  breaks = col_breaks  # enable colour transition at specified limits
)
# Mean age for every (education, target) combination, reshaped so that each
# education level is a row and each target level is a column, then drawn as
# a heatmap.
data <- aggregate(
  train$age,
  by = list(education = train$education, target = train$y),
  FUN = mean
)
# aggregate() stores the summarised values in a column named "x"; naming it
# explicitly stops dcast() from guessing the value column.
data <- dcast(data, education ~ target, value.var = "x")
rnames <- data[, 1]
mat_data <- data.matrix(data[, 2:ncol(data)])
rownames(mat_data) <- rnames
# 299 colours need 300 break points (one more break than colours).
my_palette <- colorRampPalette(c("red", "yellow", "green"))(n = 299)
col_breaks <- c(
  seq(-1, 0, length.out = 100),    # for red
  seq(0, 0.8, length.out = 100),   # for yellow
  seq(0.81, 1, length.out = 100)   # for green
)
data
# NOTE(review): heatmap() standardises each row by default, which is why the
# breaks span -1..1 rather than raw ages. With only two target columns every
# row reduces to about +/-0.71, so the colours only show which class is older.
# Confirm this is the intended reading.
heatmap(mat_data,
  margins = c(12, 12), # widens margins around plot
  col = my_palette,    # use the colour palette defined earlier
  breaks = col_breaks  # enable colour transition at specified limits
)
# ggplotly(
# Scatter plot of pdays against previous; poutcome sets both the colour and
# the size of each point, and the low alpha keeps overlapping points readable.
ggplot(data = train, mapping = aes(x = previous, y = pdays)) +
  geom_point(mapping = aes(colour = poutcome, size = poutcome), alpha = 0.1) +
  scale_size_discrete(range = c(3, 8)) +
  scale_colour_hue() +
  theme_classic()
## Warning: Using size for a discrete variable is not advised.
# )